# "Azure Text Summarization"
> "How to use Azure Text Summarization with PDF, TXT and simple text"
- toc: false
- branch: master
- badges: true
- comments: true
- categories: [azure, cognitive services, summarization]
- hide: true
- search_exclude: false
%pip install azure-ai-textanalytics pdfplumber Unidecode python-dotenv
from typing import List
import pdfplumber
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from azure.ai.textanalytics import ExtractSummaryAction
from dotenv import load_dotenv
import os
from unidecode import unidecode

DOTENV_FILEPATH = ''

# Load the credentials from the .env file before reading them
load_dotenv(DOTENV_FILEPATH)

CS_ENDPOINT = os.getenv('CV_ENDPOINT')
CS_KEY = os.getenv('CV_KEY')
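Assuming the endpoint and key live in a .env file next to the notebook, it would contain something like this (both values below are placeholders for your own resource):

CV_ENDPOINT=https://<your-resource-name>.cognitiveservices.azure.com/
CV_KEY=<your-cognitive-services-key>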
# https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/textanalytics/azure-ai-textanalytics/samples/sample_extract_summary.py
def pdf_parser(
    filepath: str,
    x_tolerance=1,
    y_tolerance=1
) -> List[str]:
    """Extract the text of each page of a PDF as a plain-ASCII string."""
    texts = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            # extract_text may return None on pages without text, so guard it
            text = page.extract_text(x_tolerance=x_tolerance, y_tolerance=y_tolerance) or ''
            texts.append(unidecode(text))
    return texts
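A quick way to eyeball the parser's output before calling Azure (my_sample.pdf is the same file summarized at the end of this post):

pages = pdf_parser(filepath='my_sample.pdf')
print(f'{len(pages)} pages extracted')
print(pages[0][:200])  # first 200 characters of the first page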
def split_in_chunks(lst, chunk_size: int):
    """Split lst into consecutive chunks of at most chunk_size items."""
    chunked_list = []
    for i in range(0, len(lst), chunk_size):
        chunked_list.append(lst[i:i + chunk_size])
    return chunked_list
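As a quick sanity check, a six-item list with chunk_size=4 comes back as two chunks; summarize_pdf below uses the same logic to batch PDF pages in groups of 25:

pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']
print(split_in_chunks(pages, chunk_size=4))
# [['p1', 'p2', 'p3', 'p4'], ['p5', 'p6']]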
def az_summary(
    texts: List[str],
    cs_endpoint: str,
    cs_key: str,
    language: str
):
    # Wrap each text in the document format the service expects,
    # so the language hint is actually sent with the request
    az_docs = []
    for i, text in enumerate(texts):
        az_docs.append({"id": str(i), "language": language, "text": text})
    text_analytics_client = TextAnalyticsClient(
        endpoint=cs_endpoint,
        credential=AzureKeyCredential(cs_key),
    )
    poller = text_analytics_client.begin_analyze_actions(
        documents=az_docs,
        actions=[
            ExtractSummaryAction(order_by='Rank'),
        ],
    )
    # poller.result() yields one list of action results per document
    extract_summary_results = []
    document_results = poller.result()
    for result in document_results:
        for ex in result:
            if not ex.is_error:
                extract_summary_results.append(ex)
    return extract_summary_results
def summarize(summaries, thr=0):
    """Collect the unique summary sentences whose rank score meets the threshold."""
    sentences = []
    for sr in summaries:
        for sentence in sr.sentences:
            if sentence.rank_score >= thr:
                sentences.append(sentence.text)
    # Deduplicate; note that set() does not preserve sentence order
    sentences = list(set(sentences))
    return sentences
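To see what the threshold does without calling the service, here is a small sketch with stand-ins for the SDK's result objects (SimpleNamespace is used purely for illustration; the real objects come back from az_summary):

from types import SimpleNamespace

fake_result = SimpleNamespace(sentences=[
    SimpleNamespace(text='Key finding.', rank_score=0.9),
    SimpleNamespace(text='Minor detail.', rank_score=0.2),
])
print(summarize([fake_result], thr=0.5))  # ['Key finding.']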
def summarize_pdf(
    filepath: str,
    cs_endpoint: str,
    cs_key: str,
    language: str,
    thr=0
):
    pdf_text = pdf_parser(filepath=filepath)
    # Send the pages to the service in batches of 25 documents per request
    chunks = split_in_chunks(
        lst=pdf_text,
        chunk_size=25
    )
    summaries = []
    for texts in chunks:
        st = az_summary(
            texts=texts,
            cs_endpoint=cs_endpoint,
            cs_key=cs_key,
            language=language
        )
        summaries.extend(st)
    sentences = summarize(summaries, thr)
    return sentences
def summarize_txt(
    filepath: str,
    cs_endpoint: str,
    cs_key: str,
    language: str,
    thr=0
):
    with open(filepath, 'r', encoding='utf-8') as fh:
        text = fh.read()
    summary = az_summary(
        texts=[text],
        cs_endpoint=cs_endpoint,
        cs_key=cs_key,
        language=language
    )
    sentences = summarize(summary, thr)
    return sentences

summary_pdf = summarize_pdf(
filepath='my_sample.pdf',
cs_endpoint=CS_ENDPOINT,
cs_key=CS_KEY,
language='en',
thr=0.5
)
print(summary_pdf)

summary_txt = summarize_txt(
filepath='my_sample.txt',
cs_endpoint=CS_ENDPOINT,
cs_key=CS_KEY,
language='en',
thr=0.5
)
print(summary_txt)

summary_results = az_summary(
    texts=["""My sample text"""],
    cs_endpoint=CS_ENDPOINT,
    cs_key=CS_KEY,
    language='en'
)
summary_text = summarize(summary_results, thr=0.5)
print(summary_text)